Pandas for Data Analysis


Attribution: Smithsonian's National Zoo

Python Frederick

Speaker: Chris Malec

Pandas¶

  • Created for Data analysis
  • Centered around a DataFrame object
  • Supports indexing, merging, and transforming data
  • Integrates well with other common libraries
    • matplotlib
    • plotly
    • sci-kit learn
    • numpy

Creating Dataframes¶

In [1]:
#create dataframe from lists
import pandas as pd
letters = ['a','b','c','d']#some letters
numbers = [1,2,3,4]#some numbers
lists_df = pd.DataFrame(data={'letters':letters,
                              'numbers':numbers})#data is a dictionary of lists
display(lists_df)
letters numbers
0 a 1
1 b 2
2 c 3
3 d 4
In [2]:
#create dataframe from numpy array
import numpy as np
numpy_data = np.array([1,2,3,4,5,6,7,8]).reshape(4,2) #a 4x2 array
numpy_df = pd.DataFrame(data=numpy_data,columns = ['column_1','column_2'])#data is a numpy array
display(numpy_df)
column_1 column_2
0 1 2
1 3 4
2 5 6
3 7 8
In [3]:
#create dataframe from records, or a list of rows
records = [('a',1),('b',2),('c',3),('d',4)]#a list of tuples
records_df = pd.DataFrame.from_records(records,
                                       columns = ['letters','numbers'])#data is a list of records
display(records_df)
letters numbers
0 a 1
1 b 2
2 c 3
3 d 4
In [4]:
#we will use this dataframe for the next few examples
ex_df = pd.DataFrame.from_records(records,
                                  columns = ['letters','numbers'],
                                  index = ['first','second','third','fourth'])#add an index
display(ex_df)
letters numbers
first a 1
second b 2
third c 3
fourth d 4

Selecting columns¶

In [5]:
# .loc is the primary way to locate both rows and columns
display(ex_df.loc[:,['letters']]) #dataframe
letters
first a
second b
third c
fourth d
In [6]:
display(ex_df.loc[:,'letters']) #series
first     a
second    b
third     c
fourth    d
Name: letters, dtype: object
In [7]:
display(ex_df.loc[:,[col for col in ex_df.columns if 'l' in col]])#select columns based on column name
letters
first a
second b
third c
fourth d

Selecting rows¶

In [8]:
display(ex_df.loc[['first','third'],:])#dataframe
letters numbers
first a 1
third c 3
In [9]:
display(ex_df.loc['first',:])#series
letters    a
numbers    1
Name: first, dtype: object
In [10]:
display(ex_df.loc[ex_df['numbers']<=2,:])#filter rows based on number column
letters numbers
first a 1
second b 2
In [11]:
display(ex_df.loc[ex_df.numbers<=2,:])#same thing, different way of specifying series
letters numbers
first a 1
second b 2

Applying functions¶

In [13]:
#apply functions
def times_two(num):
    """Return double the given value."""
    return num * 2

def to_upper(lett):
    """Return the upper-cased version of the given string."""
    upper_cased = lett.upper()
    return upper_cased

transformed_df = ex_df.apply({'letters':to_upper,'numbers':times_two})
display_side_by_side(ex_df,transformed_df,titles = ['Original','Transformed'])

Original

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Transformed

letters numbers
first A 2
second B 4
third C 6
fourth D 8

Combining tables¶

In [14]:
#create two tables to merge
ex_df2 = ex_df.apply({'numbers':times_two,'letters':lambda x:x}).rename(columns={'numbers':'2x_numbers'})
display_side_by_side(ex_df,ex_df2,titles = ['Original','Transformed'])

Original

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Transformed

2x_numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d
In [15]:
#merge on the 'letters' column
merged_df = ex_df.merge(ex_df2,on='letters')
display_side_by_side(ex_df,ex_df2,merged_df,titles = ['Left','Right','Merged'])

Left

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Right

2x_numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d

Merged

letters numbers 2x_numbers
0 a 1 2
1 b 2 4
2 c 3 6
3 d 4 8
In [16]:
#join operates on indexes instead of columns
joined_df = ex_df.join(ex_df2,lsuffix='_l',rsuffix='_r')
display_side_by_side(ex_df,ex_df2,joined_df, titles = ['Left','Right','Joined'])

Left

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Right

2x_numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d

Joined

letters_l numbers 2x_numbers letters_r
first a 1 2 a
second b 2 4 b
third c 3 6 c
fourth d 4 8 d
In [17]:
#can specify different left and right merge columns
merged_df = ex_df.merge(ex_df2,left_on='letters',right_on='letters')
display_side_by_side(ex_df,ex_df2,merged_df, titles = ['Left','Right','Merged'])

Left

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Right

2x_numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d

Merged

letters numbers 2x_numbers
0 a 1 2
1 b 2 4
2 c 3 6
3 d 4 8
In [18]:
#Default is 'inner' join, so unmatched keys are dropped
merged_df = ex_df.merge(ex_df2,left_on='numbers',right_on='2x_numbers')
display_side_by_side(ex_df,ex_df2,merged_df,titles = ['Left','Right','Merged'])

Left

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Right

2x_numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d

Merged

letters_x numbers 2x_numbers letters_y
0 b 2 2 a
1 d 4 4 b
In [19]:
#You can specify the suffixes that appear when columns get duplicated
merged_df = ex_df.merge(ex_df2,left_on='numbers',right_on='2x_numbers',suffixes=('_l','_r'))
display_side_by_side(ex_df,ex_df2,merged_df,titles = ['Left','Right','Merged'])

Left

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Right

2x_numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d

Merged

letters_l numbers 2x_numbers letters_r
0 b 2 2 a
1 d 4 4 b
In [20]:
#Specifying 'right' join causes *all* rows in the right dataframe to appear, 
#and fill in na's if they are unmatched by the left dataframe
merged_df = ex_df.merge(ex_df2,left_on='numbers',right_on='2x_numbers',suffixes=('_l','_r'),how='right')
display_side_by_side(ex_df,ex_df2,merged_df, titles = ['Left','Right','Merged'])

Left

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Right

2x_numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d

Merged

letters_l numbers 2x_numbers letters_r
0 b 2.0 2 a
1 d 4.0 4 b
2 NaN NaN 6 c
3 NaN NaN 8 d
In [21]:
#Concatenation is more like a union
ex_df2 = ex_df.apply({'numbers':times_two,'letters':lambda x:x})
concatenated_df = pd.concat([ex_df,ex_df2],axis=0,ignore_index=True)
display_side_by_side(ex_df,ex_df2,concatenated_df,titles = ['Left','Right','Concatenated'])

Left

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Right

numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d

Concatenated

letters numbers
0 a 1
1 b 2
2 c 3
3 d 4
4 a 2
5 b 4
6 c 6
7 d 8
In [22]:
#concatenation along columns (axis=1 places the frames side by side)
concatenated_df = pd.concat([ex_df,ex_df2],axis=1)
display_side_by_side(ex_df,ex_df2,concatenated_df,titles = ['Left','Right','Concatenated'])

Left

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Right

numbers letters
first 2 a
second 4 b
third 6 c
fourth 8 d

Concatenated

letters numbers numbers letters
first a 1 2 a
second b 2 4 b
third c 3 6 c
fourth d 4 8 d
In [23]:
#what happens when you have duplicated columns
display(concatenated_df['letters'])
letters letters
first a a
second b b
third c c
fourth d d

Plotting data¶

In [24]:
#create plots
#from dataframe

import matplotlib.pyplot as plt
ex_df.set_index('letters').plot(kind='bar')
plt.show()
In [25]:
#with plotly
import plotly.express as px
fig = px.bar(ex_df,
             x = 'letters',
             y = 'numbers')
fig.update_layout(width=600,height=400)
fig.show()

Summarizing tables¶

In [26]:
#create summary tables
summary_df = ex_df.describe()
display_side_by_side(ex_df,summary_df,titles = ['Original','Summary'])

Original

letters numbers
first a 1
second b 2
third c 3
fourth d 4

Summary

numbers
count 4.000000
mean 2.500000
std 1.290994
min 1.000000
25% 1.750000
50% 2.500000
75% 3.250000
max 4.000000
In [27]:
#column summaries
summary_df = ex_df.agg({'numbers':sum,'letters':min})
display(ex_df,summary_df)
letters numbers
first a 1
second b 2
third c 3
fourth d 4
numbers    10
letters     a
dtype: object
In [28]:
#groupby
ex_group_df = pd.concat([ex_df,
                         ex_df.shift(1).bfill().rename(index = dict((k,k+'_1') for k in ex_df.index)),
                         ex_df.shift(-1).ffill().rename(index = dict((k,k+'_-1') for k in ex_df.index)),
                         ex_df.shift(2).fillna(0).rename(index = dict((k,k+'_2') for k in ex_df.index))])

display(ex_group_df)
letters numbers
first a 1.0
second b 2.0
third c 3.0
fourth d 4.0
first_1 a 1.0
second_1 a 1.0
third_1 b 2.0
fourth_1 c 3.0
first_-1 b 2.0
second_-1 c 3.0
third_-1 d 4.0
fourth_-1 d 4.0
first_2 0 0.0
second_2 0 0.0
third_2 a 1.0
fourth_2 b 2.0
In [29]:
grouped_df = ex_group_df.groupby('letters').sum()#sum the numbers that correspond to a particular letter
sorted_ex_group_df = ex_group_df.astype({'letters':'string'}).sort_values('letters')#make all entries strings and sort
display_side_by_side(sorted_ex_group_df,grouped_df,titles = ['Original','Grouped'])

Original

letters numbers
first_2 0 0.0
second_2 0 0.0
first a 1.0
first_1 a 1.0
second_1 a 1.0
third_2 a 1.0
second b 2.0
third_1 b 2.0
first_-1 b 2.0
fourth_2 b 2.0
third c 3.0
fourth_1 c 3.0
second_-1 c 3.0
fourth d 4.0
third_-1 d 4.0
fourth_-1 d 4.0

Grouped

numbers
letters
0 0.0
a 4.0
b 8.0
c 9.0
d 12.0
In [30]:
#multiple aggregations
grouped_df = ex_group_df.groupby('letters').agg({'numbers':[min,max,sum,'count']})#create different aggregations for the same column
sorted_ex_group_df = ex_group_df.astype({'letters':'string'}).sort_values('letters')
display_side_by_side(sorted_ex_group_df,grouped_df,titles = ['Original','Multiple Aggregations '])

Original

letters numbers
first_2 0 0.0
second_2 0 0.0
first a 1.0
first_1 a 1.0
second_1 a 1.0
third_2 a 1.0
second b 2.0
third_1 b 2.0
first_-1 b 2.0
fourth_2 b 2.0
third c 3.0
fourth_1 c 3.0
second_-1 c 3.0
fourth d 4.0
third_-1 d 4.0
fourth_-1 d 4.0

Multiple Aggregations

numbers
min max sum count
letters
0 0.0 0.0 0.0 2
a 1.0 1.0 4.0 4
b 2.0 2.0 8.0 4
c 3.0 3.0 9.0 3
d 4.0 4.0 12.0 3
In [31]:
#Indices and columns can have arbitrary number of levels
grouped_df = ex_group_df.groupby('letters').agg({'numbers':[min,max,sum,'count']})
print('Hierarchical: ',grouped_df.columns)
grouped_df = grouped_df.reset_index()
grouped_df2 = grouped_df.copy()#this is important
grouped_df2.columns = grouped_df2.columns.droplevel()
print('Flat: ',grouped_df2.columns)
grouped_df2 = grouped_df2.reset_index().rename(columns={'':'letters'})
display_side_by_side(grouped_df,grouped_df2,titles = ['Hierarchical','Flat'])
Hierarchical:  MultiIndex([('numbers',   'min'),
            ('numbers',   'max'),
            ('numbers',   'sum'),
            ('numbers', 'count')],
           )
Flat:  Index(['', 'min', 'max', 'sum', 'count'], dtype='object')

Hierarchical

letters numbers
min max sum count
0 0 0.0 0.0 0.0 2
1 a 1.0 1.0 4.0 4
2 b 2.0 2.0 8.0 4
3 c 3.0 3.0 9.0 3
4 d 4.0 4.0 12.0 3

Flat

index letters min max sum count
0 0 0 0.0 0.0 0.0 2
1 1 a 1.0 1.0 4.0 4
2 2 b 2.0 2.0 8.0 4
3 3 c 3.0 3.0 9.0 3
4 4 d 4.0 4.0 12.0 3
In [32]:
#melt a dataframe, make several columns into two
melted_df = grouped_df2.melt(id_vars = 'letters',
                             value_vars = ['min','max','sum','count'],
                             var_name = 'aggregation',
                             value_name = 'value')

#pivot dataframe, make two or more columns into many
pivot_df = melted_df.pivot(index = 'letters',
                           columns = 'aggregation',
                           values = 'value')

display_side_by_side(grouped_df2,melted_df,pivot_df,titles = ['Original','Melted','Pivoted'])

Original

index letters min max sum count
0 0 0 0.0 0.0 0.0 2
1 1 a 1.0 1.0 4.0 4
2 2 b 2.0 2.0 8.0 4
3 3 c 3.0 3.0 9.0 3
4 4 d 4.0 4.0 12.0 3

Melted

letters aggregation value
0 0 min 0.0
1 a min 1.0
2 b min 2.0
3 c min 3.0
4 d min 4.0
5 0 max 0.0
6 a max 1.0
7 b max 2.0
8 c max 3.0
9 d max 4.0
10 0 sum 0.0
11 a sum 4.0
12 b sum 8.0
13 c sum 9.0
14 d sum 12.0
15 0 count 2.0
16 a count 4.0
17 b count 4.0
18 c count 3.0
19 d count 3.0

Pivoted

aggregation count max min sum
letters
0 2.0 0.0 0.0 0.0
a 4.0 1.0 1.0 4.0
b 4.0 2.0 2.0 8.0
c 3.0 3.0 3.0 9.0
d 3.0 4.0 4.0 12.0

Real Life¶

Concerns of real data¶

  • Null values are a lot more important
  • A lot of (sometimes non-sensical) datatypes
  • Just getting it from file to dataframe is probably half the battle
  • The size can easily get out of hand (keep it under about 5 million rows for pandas)

Today's example data: IEX order book data¶

  • What is IEX?
  • What is an order book?
  • What is market manipulation?
  • What is Reg NMS?
  • How can anyone find manipulation?

I don't intend to find any manipulation in this data set or make any 'grand conclusions,' but hopefully you'll see some useful things you can do with pandas and get a neat view of an important week in the equities market.

What is IEX?¶

The Investor's Exchange is one of the 15 or so independent stock exchanges that operate in the U.S. It was featured in the Michael Lewis book Flash Boys. They do their best to offer access to the market with protections in place that favor long-term over short-term strategies. They're pretty small compared to NYSE or NASDAQ, but they allow you to download pretty detailed historical data for free, even if it is a dump of network packets.

What is an order book?¶

Every time a broker wants to make a trade, they put in an order (I want to buy 200 shares of stock A for $100 each). Until the order is canceled, or a seller is found that is willing to meet the buyer at their price, the order 'rests on the book.' Most heavily traded stocks do not have orders that rest long, unless the number of shares is exceptionally large or the price point is completely unreasonable.

time side quantity price
9:30 B 100 45.00
9:31 B 200 45.01
9:31 S 100 45.05
9:32 B 50 45.00

What is market manipulation?¶

This is relatively subjective, but the crux of many types of manipulation is 'are you engaging with the market in good faith?' In other words, are you placing orders onto the books because you would like to buy a stock at the price you stated (or at least hope someone will sell to you at that price), or are you sending false signals to manipulate at what prices people are willing to buy/sell.

time side quantity price firm
9:30 B 100 45.00 A
9:31 B 200 45.01 A
9:31 S 100 45.05 B
9:32 B 50 45.00 A
9:33 S 100 45.04 Manip
9:33 S 200 45.04 Manip
9:33 S 500 45.03 Manip
9:33 S 100 45.05 Manip
9:33 S 400 45.03 Manip
9:33 S 100 45.03 Manip
9:34 S 600 45.02 IllBite
9:34 B 600 45.02 Manip

What is Reg NMS¶

Reg NMS is an SEC rule that created the 'National Market System.' Stock exchanges became publicly traded companies and firms routing orders to the markets gained the obligation of 'best execution.' This generally means that if NYSE's order book has a stock selling at 45.05 and NASDAQ's order book has the same stock selling at 45.04, your buy order with a Charles Schwab account needs to take the shares at NASDAQ at the better price.

The regulation is intended to keep brokerages from giving customers a bad deal, but it also created a complicated set of rules that can be gamed.

How can anyone find manipulation?¶

Short answer: Regulators get to see much more detailed information than everyone else. It can be pretty difficult even with the extra information.

In [33]:
manip_df = pd.read_csv('example_manip.csv',
                       sep=',',
                       header=1)
display(manip_df)
Time Firm Side Price Quantity Order ID Action NBO NBB
0 10/13/2021 9:31:00.0 AM Firm S S 50.05 1000 1 Order 50.05 49.95
1 10/13/2021 9:31:00.0 AM Firm B B 49.95 1000 2 Order 50.05 49.95
2 10/13/2021 9:31:00.2 AM Firm M S 50.04 200 3 Order 50.04 49.95
3 10/13/2021 9:31:01.5 AM Firm M S 50.03 100 4 Order 50.03 49.95
4 10/13/2021 9:31:03.0 AM Firm M S 50.03 100 4 Cancel 50.04 49.95
5 10/13/2021 9:31:03.2 AM Firm M S 50.01 150 5 Order 50.01 49.95
6 10/13/2021 9:31:03.3 AM Firm M S 50.00 300 6 Order 50.00 49.95
7 10/13/2021 9:31:04.5 AM Firm M S 50.03 500 7 Order 50.00 49.95
8 10/13/2021 9:31:04.6 AM Firm M S 50.04 200 8 Order 50.00 49.95
9 10/13/2021 9:31:05.7 AM Firm M S 49.96 200 9 Order 49.96 49.95
10 10/13/2021 9:31:05.7 AM Firm M S 50.04 100 8 Cancel 49.96 49.95
11 10/13/2021 9:31:05.5 AM Firm M S 49.97 600 10 Order 49.96 49.95
12 10/13/2021 9:31:05.6 AM Firm M S 49.96 300 11 Order 49.96 49.95
13 10/13/2021 9:31:06.8 AM Firm S S 49.96 1000 12 Order 49.96 49.95
14 10/13/2021 9:31:06.9 AM Firm M B 49.96 1000 13 Order 49.96 49.96
15 10/13/2021 9:31:06.9 AM Firm M X 49.96 1000 12_13 Trade 49.96 49.95
16 10/13/2021 9:31:07.0 AM Firm M S 49.96 300 11 Cancel 49.97 49.95
17 10/13/2021 9:31:07.0 AM Firm M S 49.97 600 10 Cancel 50.00 49.95
18 10/13/2021 9:31:07.0 AM Firm M S 49.96 200 9 Cancel 50.00 49.95
19 10/13/2021 9:31:07.0 AM Firm M S 50.04 100 8 Cancel 50.00 49.95
20 10/13/2021 9:31:07.0 AM Firm M S 50.03 500 7 Cancel 50.00 49.95
21 10/13/2021 9:31:07.0 AM Firm M S 50.00 300 6 Cancel 50.00 49.95
22 10/13/2021 9:31:07.0 AM Firm M S 50.01 150 5 Cancel 50.04 49.95
23 10/13/2021 9:31:07.0 AM Firm M S 50.04 200 3 Cancel 50.05 49.95
In [34]:
def string_to_timestamp(string):
    """Convert a '<date> <time> <AM|PM>' string into a pandas Timestamp.

    e.g. '10/13/2021 9:31:00.0 AM' -> Timestamp('2021-10-13 09:31:00').
    """
    # The previous implementation kept only the first two whitespace-separated
    # tokens (date and time) and discarded the trailing AM/PM marker, so any
    # afternoon time would have been parsed 12 hours early.  Handing the full
    # string to pandas lets its dateutil-backed parser honor the meridiem.
    return pd.to_datetime(string)
manip_df['timestamp'] = manip_df['Time'].apply(string_to_timestamp)
manip_df
Out[34]:
Time Firm Side Price Quantity Order ID Action NBO NBB timestamp
0 10/13/2021 9:31:00.0 AM Firm S S 50.05 1000 1 Order 50.05 49.95 2021-10-13 09:31:00.000
1 10/13/2021 9:31:00.0 AM Firm B B 49.95 1000 2 Order 50.05 49.95 2021-10-13 09:31:00.000
2 10/13/2021 9:31:00.2 AM Firm M S 50.04 200 3 Order 50.04 49.95 2021-10-13 09:31:00.200
3 10/13/2021 9:31:01.5 AM Firm M S 50.03 100 4 Order 50.03 49.95 2021-10-13 09:31:01.500
4 10/13/2021 9:31:03.0 AM Firm M S 50.03 100 4 Cancel 50.04 49.95 2021-10-13 09:31:03.000
5 10/13/2021 9:31:03.2 AM Firm M S 50.01 150 5 Order 50.01 49.95 2021-10-13 09:31:03.200
6 10/13/2021 9:31:03.3 AM Firm M S 50.00 300 6 Order 50.00 49.95 2021-10-13 09:31:03.300
7 10/13/2021 9:31:04.5 AM Firm M S 50.03 500 7 Order 50.00 49.95 2021-10-13 09:31:04.500
8 10/13/2021 9:31:04.6 AM Firm M S 50.04 200 8 Order 50.00 49.95 2021-10-13 09:31:04.600
9 10/13/2021 9:31:05.7 AM Firm M S 49.96 200 9 Order 49.96 49.95 2021-10-13 09:31:05.700
10 10/13/2021 9:31:05.7 AM Firm M S 50.04 100 8 Cancel 49.96 49.95 2021-10-13 09:31:05.700
11 10/13/2021 9:31:05.5 AM Firm M S 49.97 600 10 Order 49.96 49.95 2021-10-13 09:31:05.500
12 10/13/2021 9:31:05.6 AM Firm M S 49.96 300 11 Order 49.96 49.95 2021-10-13 09:31:05.600
13 10/13/2021 9:31:06.8 AM Firm S S 49.96 1000 12 Order 49.96 49.95 2021-10-13 09:31:06.800
14 10/13/2021 9:31:06.9 AM Firm M B 49.96 1000 13 Order 49.96 49.96 2021-10-13 09:31:06.900
15 10/13/2021 9:31:06.9 AM Firm M X 49.96 1000 12_13 Trade 49.96 49.95 2021-10-13 09:31:06.900
16 10/13/2021 9:31:07.0 AM Firm M S 49.96 300 11 Cancel 49.97 49.95 2021-10-13 09:31:07.000
17 10/13/2021 9:31:07.0 AM Firm M S 49.97 600 10 Cancel 50.00 49.95 2021-10-13 09:31:07.000
18 10/13/2021 9:31:07.0 AM Firm M S 49.96 200 9 Cancel 50.00 49.95 2021-10-13 09:31:07.000
19 10/13/2021 9:31:07.0 AM Firm M S 50.04 100 8 Cancel 50.00 49.95 2021-10-13 09:31:07.000
20 10/13/2021 9:31:07.0 AM Firm M S 50.03 500 7 Cancel 50.00 49.95 2021-10-13 09:31:07.000
21 10/13/2021 9:31:07.0 AM Firm M S 50.00 300 6 Cancel 50.00 49.95 2021-10-13 09:31:07.000
22 10/13/2021 9:31:07.0 AM Firm M S 50.01 150 5 Cancel 50.04 49.95 2021-10-13 09:31:07.000
23 10/13/2021 9:31:07.0 AM Firm M S 50.04 200 3 Cancel 50.05 49.95 2021-10-13 09:31:07.000
In [35]:
import plotly.express as px
import plotly.graph_objects as go

# Scatter of every order-book event: x/y = time/price, marker size = shares.
fig = px.scatter(manip_df,
                 x='timestamp',
                 y='Price',
                 symbol='Action',
                 color='Firm',
                 size='Quantity',
                 color_discrete_sequence=['goldenrod','aqua','lightgreen'],
                 opacity=.5)

# NBO = National Best Offer (lowest ask).  Was mislabeled 'Best Bid'.
fig.add_trace(go.Scatter(x=manip_df['timestamp'],
                         y=manip_df['NBO'],
                         mode='lines',
                         line_color='magenta',
                         line_shape='hv',
                         name='Best Offer'))

# NBB = National Best Bid (highest bid).  Was mislabeled 'Best Offer'.
fig.add_trace(go.Scatter(x=manip_df['timestamp'],
                         y=manip_df['NBB'],
                         mode='lines',
                         line_color='cyan',
                         line_shape='hv',
                         name='Best Bid'))
fig.update_layout(template='plotly_dark');
In [41]:
from IPython.display import Image
In [43]:
Image(fig.to_image(format="png"))
Out[43]:

IEX Order Book for GME 1/22 - 1/28¶

In [45]:
!xxd -l 112 -s 20000 ~/Downloads/data_feeds_20210122_20210122_IEXTP1_DEEP1.0.pcap
00004e20: 7cb9 0500 ed64 2f58 e805 0000 e805 0000  |....d/X........
00004e30: 0100 5e57 1504 b859 9ff9 2d53 0800 4500  ..^W...Y..-S..E.
00004e40: 05da e340 4000 4011 9f90 17e2 9b84 e9d7  ...@@.@.........
00004e50: 1504 288a 288a 05c6 0402 0100 0480 0100  ..(.(...........
00004e60: 0000 0000 d948 9605 4200 fc42 0000 0000  .....H..B..B....
00004e70: 0000 1903 0000 0000 0000 f76d ff74 b88d  ...........m.t..
00004e80: 5c16 1600 4854 c51f ff74 b88d 5c16 4149  \...HT...t..\.AI
In [46]:
#A lovely command line conversion of the binary to a json file
#!pip install iex_parser
#!iex_to_json -i iex_file.pcap.gz -o iex_file.json.gz -t 'GME' -s
In [48]:
import json
with open('iex_deep_quotes_and_trades.json') as file:
    line = file.readline()
    print(line[:500])
[{"type":"trade_report","event":null,"timestamp":"2021-01-22T13:02:31.215300+00:00","status":null,"symbol":"GME","detail":null,"halt_status":null,"reason":null,"flags":96.0,"size":39.0,"price":45.19,"trade_id":2067095.0,"side":null,"security_event":null},{"type":"trade_report","event":null,"timestamp":"2021-01-22T13:08:14.700160+00:00","status":null,"symbol":"GME","detail":null,"halt_status":null,"reason":null,"flags":96.0,"size":50.0,"price":44.87,"trade_id":2639914.0,"side":null,"security_even
In [49]:
#Read in json file, many json files that have a schema without 
#a lot of nesting and variation can be read safely as records
from pandas.io.json import read_json

json_df = read_json('iex_deep_quotes_and_trades.json',orient='records')
display(json_df)
type event timestamp status symbol detail halt_status reason flags size price trade_id side security_event
0 trade_report NaN 2021-01-22 13:02:31.215300+00:00 NaN GME NaN NaN NaN 96 39 45.19 2.067095e+06 None NaN
1 trade_report NaN 2021-01-22 13:08:14.700160+00:00 NaN GME NaN NaN NaN 96 50 44.87 2.639914e+06 None NaN
2 trade_report NaN 2021-01-22 13:11:52.294756+00:00 NaN GME NaN NaN NaN 96 50 44.58 3.063945e+06 None NaN
3 trade_report NaN 2021-01-22 13:18:22.383301+00:00 NaN GME NaN NaN NaN 96 50 44.04 3.669247e+06 None NaN
4 trade_report NaN 2021-01-22 13:19:09.002873+00:00 NaN GME NaN NaN NaN 96 10 43.78 3.739332e+06 None NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
490763 price_level_update NaN 2021-01-28 21:59:06.793899+00:00 NaN GME NaN NaN NaN 1 0 262.00 NaN S NaN
490764 price_level_update NaN 2021-01-28 21:59:06.797557+00:00 NaN GME NaN NaN NaN 1 100 262.00 NaN S NaN
490765 trade_report NaN 2021-01-28 21:59:08.241677+00:00 NaN GME NaN NaN NaN 96 19 262.00 3.027362e+09 None NaN
490766 price_level_update NaN 2021-01-28 21:59:08.241677+00:00 NaN GME NaN NaN NaN 1 0 262.00 NaN S NaN
490767 price_level_update NaN 2021-01-28 22:00:00.018339+00:00 NaN GME NaN NaN NaN 1 0 988.00 NaN S NaN

490768 rows × 14 columns

In [50]:
#see what the na situation is
json_df.isna().sum()
Out[50]:
type                   0
event             490768
timestamp              0
status            490768
symbol                 0
detail            490768
halt_status       490768
reason            490768
flags                  0
size                   0
price                  0
trade_id          356056
side              134712
security_event    490768
dtype: int64
In [52]:
#get rid of columns that are entirely null
json_df = json_df.dropna(axis = 1,how='all')
display(json_df)
type timestamp symbol flags size price trade_id side
0 trade_report 2021-01-22 13:02:31.215300+00:00 GME 96 39 45.19 2.067095e+06 None
1 trade_report 2021-01-22 13:08:14.700160+00:00 GME 96 50 44.87 2.639914e+06 None
2 trade_report 2021-01-22 13:11:52.294756+00:00 GME 96 50 44.58 3.063945e+06 None
3 trade_report 2021-01-22 13:18:22.383301+00:00 GME 96 50 44.04 3.669247e+06 None
4 trade_report 2021-01-22 13:19:09.002873+00:00 GME 96 10 43.78 3.739332e+06 None
... ... ... ... ... ... ... ... ...
490763 price_level_update 2021-01-28 21:59:06.793899+00:00 GME 1 0 262.00 NaN S
490764 price_level_update 2021-01-28 21:59:06.797557+00:00 GME 1 100 262.00 NaN S
490765 trade_report 2021-01-28 21:59:08.241677+00:00 GME 96 19 262.00 3.027362e+09 None
490766 price_level_update 2021-01-28 21:59:08.241677+00:00 GME 1 0 262.00 NaN S
490767 price_level_update 2021-01-28 22:00:00.018339+00:00 GME 1 0 988.00 NaN S

490768 rows × 8 columns

In [53]:
#What are we left with?
json_df.isna().sum()
Out[53]:
type              0
timestamp         0
symbol            0
flags             0
size              0
price             0
trade_id     356056
side         134712
dtype: int64
In [54]:
#What data types are we working with?
json_df.dtypes
Out[54]:
type                      object
timestamp    datetime64[ns, UTC]
symbol                    object
flags                      int64
size                       int64
price                    float64
trade_id                 float64
side                      object
dtype: object
In [55]:
#The objects really should be strings
json_df = json_df.astype({'type':'string','symbol':'string','side':'string'})
json_df.dtypes
Out[55]:
type                      string
timestamp    datetime64[ns, UTC]
symbol                    string
flags                      int64
size                       int64
price                    float64
trade_id                 float64
side                      string
dtype: object
In [56]:
#Fill in nulls on the side, since that may cause trouble plotting trades
#create a date column for filtering purposes, and change from UTC to EST
from datetime import timezone
json_df = json_df.fillna({'side':'X'})#replace nulls
json_df['date'] = json_df.apply({'timestamp':lambda x: x.date})#a column with just the date can be useful
json_df['timestamp'] = json_df['timestamp'].apply(lambda x: x.astimezone(tz='EST').replace(tzinfo=None))#change to local time
display(json_df)
type timestamp symbol flags size price trade_id side date
0 trade_report 2021-01-22 08:02:31.215300 GME 96 39 45.19 2.067095e+06 X 2021-01-22
1 trade_report 2021-01-22 08:08:14.700160 GME 96 50 44.87 2.639914e+06 X 2021-01-22
2 trade_report 2021-01-22 08:11:52.294756 GME 96 50 44.58 3.063945e+06 X 2021-01-22
3 trade_report 2021-01-22 08:18:22.383301 GME 96 50 44.04 3.669247e+06 X 2021-01-22
4 trade_report 2021-01-22 08:19:09.002873 GME 96 10 43.78 3.739332e+06 X 2021-01-22
... ... ... ... ... ... ... ... ... ...
490763 price_level_update 2021-01-28 16:59:06.793899 GME 1 0 262.00 NaN S 2021-01-28
490764 price_level_update 2021-01-28 16:59:06.797557 GME 1 100 262.00 NaN S 2021-01-28
490765 trade_report 2021-01-28 16:59:08.241677 GME 96 19 262.00 3.027362e+09 X 2021-01-28
490766 price_level_update 2021-01-28 16:59:08.241677 GME 1 0 262.00 NaN S 2021-01-28
490767 price_level_update 2021-01-28 17:00:00.018339 GME 1 0 988.00 NaN S 2021-01-28

490768 rows × 9 columns

In [57]:
#Create subset that is just the order book updates
mask = json_df['type']=='price_level_update'
select_cols = ['timestamp','size','price','side']
sort_cols = ['timestamp','price']
order_df = json_df.loc[mask,select_cols].sort_values(sort_cols)
In [58]:
class PriceLevels(dict):
    """One side of an order book: a dict mapping price -> resting size."""

    def ignore_item(self, item):
        # Kept for backward compatibility; intentionally a no-op.
        return self

    def add_or_discard(self, size, price, side, quote_side):
        """Apply one book update for `side`, if it matches this book's `quote_side`.

        A positive size sets (or overwrites) the level; a size of exactly zero
        removes it.  Updates for the opposite side — or with a negative size —
        are ignored.  Returns self for chaining.
        """
        if side == quote_side:
            if size > 0:
                self[price] = size
            elif size == 0:
                # pop with a default so that a cancel for a price we never
                # recorded is ignored instead of raising KeyError.
                self.pop(price, None)
        return self

    def get_bbo(self, side):
        """Best bid ('B' -> highest price) or best offer ('S' -> lowest price).

        Returns None when the book is empty or `side` is unrecognized.
        """
        if not self:
            return None
        if side == 'B':
            return max(self)
        if side == 'S':
            return min(self)
        return None

    def get_vwap(self):
        """Volume-weighted average price over all resting levels (None if empty)."""
        if not self:
            return None
        return sum(price * qty for price, qty in self.items()) / sum(self.values())

    def update_prices(self, size, price, side, quote_side):
        """Mutate the book in place, then return an independent snapshot copy."""
        self.add_or_discard(size, price, side, quote_side)
        return PriceLevels(self.copy())
In [59]:
a = PriceLevels({45.45:100,50:100,55:50})
print('set price levels: ',a)
a.update_prices(100,46.05,'B','B')
print('update price level same side: ',a)
a.update_prices(100,46.10,'S','B')
print('update price level opposite side: ',a)
a.update_prices(0,50,'B','B')
print('remove price same side: ',a)
print('get the best bid: ',a.get_bbo('B'))
print('get vwap: ',a.get_vwap())
set price levels:  {45.45: 100, 50: 100, 55: 50}
update price level same side:  {45.45: 100, 50: 100, 55: 50, 46.05: 100}
update price level opposite side:  {45.45: 100, 50: 100, 55: 50, 46.05: 100}
remove price same side:  {45.45: 100, 55: 50, 46.05: 100}
get the best bid:  55
get vwap:  47.6
In [60]:
#For each timestamp, find current sizes of each available price, the best bid and offer,
#as well as the VWAP (volume-weighted average price) of the buy orders, sell orders, and all orders
bid = PriceLevels()  # resting buy levels, keyed by price
ofr = PriceLevels()  # resting sell levels, keyed by price
quotes = dict()  # timestamp -> snapshot of the book state at that instant
#Use iterrows()
for row in order_df.iterrows():
    # row is an (index, Series) pair; the Series unpacks in column order,
    # which was fixed to ['timestamp','size','price','side'] when order_df
    # was built in the previous cell.
    timestamp,size,price,side = row[1]
    # update_prices mutates the running book and returns an independent
    # copy, so each stored snapshot stays frozen at this timestamp.
    # NOTE(review): if two updates share a timestamp, the later one
    # overwrites the earlier snapshot in `quotes`.
    quotes[timestamp] = {'bid':bid.update_prices(size,price,side,'B'),
                         'ofr':ofr.update_prices(size,price,side,'S'),
                         'best_bid':bid.get_bbo('B'),
                         'bid_vwap':bid.get_vwap(),
                         'best_ofr':ofr.get_bbo('S'),
                         'ofr_vwap':ofr.get_vwap(),
                         'avg_vwap':PriceLevels({**bid,**ofr}).get_vwap(),
                         'date':timestamp.date()}
In [63]:
list(quotes.items())[500:504]
Out[63]:
[(Timestamp('2021-01-22 09:31:48.205629'),
  {'bid': {24.98: 100},
   'ofr': {43.61: 300},
   'best_bid': 24.98,
   'bid_vwap': 24.98,
   'best_ofr': 43.61,
   'ofr_vwap': 43.61,
   'avg_vwap': 38.9525,
   'date': datetime.date(2021, 1, 22)}),
 (Timestamp('2021-01-22 09:31:48.320642'),
  {'bid': {24.98: 100},
   'ofr': {43.61: 300, 43.59: 300},
   'best_bid': 24.98,
   'bid_vwap': 24.98,
   'best_ofr': 43.59,
   'ofr_vwap': 43.6,
   'avg_vwap': 40.94,
   'date': datetime.date(2021, 1, 22)}),
 (Timestamp('2021-01-22 09:31:48.320661'),
  {'bid': {24.98: 100},
   'ofr': {43.59: 300},
   'best_bid': 24.98,
   'bid_vwap': 24.98,
   'best_ofr': 43.59,
   'ofr_vwap': 43.59,
   'avg_vwap': 38.93750000000001,
   'date': datetime.date(2021, 1, 22)}),
 (Timestamp('2021-01-22 09:31:48.325218'),
  {'bid': {24.98: 100},
   'ofr': {43.6: 300},
   'best_bid': 24.98,
   'bid_vwap': 24.98,
   'best_ofr': 43.6,
   'ofr_vwap': 43.6,
   'avg_vwap': 38.945,
   'date': datetime.date(2021, 1, 22)})]
In [64]:
#Create a dataframe of the current state of the order book at each timestamp
quote_df = (pd.DataFrame
            .from_dict(quotes,orient='index')
            .reset_index()
            .rename(columns={'index':'timestamp'})
            .dropna(subset=['best_bid','best_ofr'],how='all'))

display(quote_df)
timestamp bid ofr best_bid bid_vwap best_ofr ofr_vwap avg_vwap date
0 2021-01-22 08:25:48.218283 {44.45: 1000} {} 44.45 44.45 NaN NaN 44.45000 2021-01-22
2 2021-01-22 08:44:23.578790 {44.04: 1000} {} 44.04 44.04 NaN NaN 44.04000 2021-01-22
4 2021-01-22 08:49:02.526710 {} {43.66: 100} NaN NaN 43.66 43.66000 43.66000 2021-01-22
6 2021-01-22 08:49:28.189795 {43.34: 100} {} 43.34 43.34 NaN NaN 43.34000 2021-01-22
8 2021-01-22 08:51:11.786607 {43.41: 100} {} 43.41 43.41 NaN NaN 43.41000 2021-01-22
... ... ... ... ... ... ... ... ... ...
342613 2021-01-28 16:56:26.486624 {} {988.0: 112} NaN NaN 988.00 988.00000 988.00000 2021-01-28
342614 2021-01-28 16:59:06.409716 {} {988.0: 112, 262.0: 100} NaN NaN 262.00 645.54717 645.54717 2021-01-28
342615 2021-01-28 16:59:06.793899 {} {988.0: 112} NaN NaN 988.00 988.00000 988.00000 2021-01-28
342616 2021-01-28 16:59:06.797557 {} {988.0: 112, 262.0: 100} NaN NaN 262.00 645.54717 645.54717 2021-01-28
342617 2021-01-28 16:59:08.241677 {} {988.0: 112} NaN NaN 988.00 988.00000 988.00000 2021-01-28

342023 rows × 9 columns

In [65]:
#Get just the trades from the original dataframe
is_trade = json_df['type'] == 'trade_report'
trade_df = json_df.loc[is_trade, :]
trade_df
Out[65]:
type timestamp symbol flags size price trade_id side date
0 trade_report 2021-01-22 08:02:31.215300 GME 96 39 45.19 2.067095e+06 X 2021-01-22
1 trade_report 2021-01-22 08:08:14.700160 GME 96 50 44.87 2.639914e+06 X 2021-01-22
2 trade_report 2021-01-22 08:11:52.294756 GME 96 50 44.58 3.063945e+06 X 2021-01-22
3 trade_report 2021-01-22 08:18:22.383301 GME 96 50 44.04 3.669247e+06 X 2021-01-22
4 trade_report 2021-01-22 08:19:09.002873 GME 96 10 43.78 3.739332e+06 X 2021-01-22
... ... ... ... ... ... ... ... ... ...
490754 trade_report 2021-01-28 16:54:05.085758 GME 96 5 257.58 3.026635e+09 X 2021-01-28
490755 trade_report 2021-01-28 16:54:05.592947 GME 96 22 257.58 3.026636e+09 X 2021-01-28
490759 trade_report 2021-01-28 16:56:26.486624 GME 224 1 264.00 3.026962e+09 X 2021-01-28
490762 trade_report 2021-01-28 16:59:06.793899 GME 192 100 262.00 3.027353e+09 X 2021-01-28
490765 trade_report 2021-01-28 16:59:08.241677 GME 96 19 262.00 3.027362e+09 X 2021-01-28

134712 rows × 9 columns

In [67]:
#Combine the two dataframes so that each trade report is associated with a set of 'prevailing quotes'
plot_df = (pd.concat([quote_df, trade_df], ignore_index=False)
             # quote rows carry no 'type'/'symbol' columns; label them explicitly
             .fillna({'type': 'quote', 'symbol': 'GME'})
             .sort_values(['timestamp', 'type'])
             .drop('side', axis=1)
             # forward-fill so each trade row inherits the most recent quote fields
             .ffill()
             # drop leading rows seen before both a best bid and a best offer exist
             .dropna(subset=['best_bid', 'best_ofr'])
             .astype({'date': 'string'}))

# FIX: take an explicit .copy() of the filtered slice. Assigning the 'midpt'
# column onto a .loc slice can trigger SettingWithCopyWarning and, on a view,
# silently fail to write.
plot_df = plot_df.loc[plot_df['type'] == 'trade_report', :].copy()

# midpoint of the best bid/offer prevailing at the time of each trade
plot_df['midpt'] = (plot_df['best_bid'] + plot_df['best_ofr']) / 2
In [68]:
def make_plot(plot_df, date):
    """Build a plotly figure of quotes, VWAPs, and trades for one date.

    Filters `plot_df` to rows whose 'date' column (as string) equals `date`,
    then plots best bid/offer, bid/offer VWAPs, trade prices, and the
    average quote VWAP. Returns the plotly Figure.
    """
    import plotly.graph_objects as go

    daily_df = plot_df.loc[plot_df.date.astype('string') == date, :]

    fig = go.Figure()

    # One spec per trace, added in the original order so the legend is stable.
    trace_specs = [
        dict(y=daily_df['best_bid'], mode='markers', marker_color='blue',
             opacity=0.75, line_shape='hv', name='Best Bid'),
        dict(y=daily_df['bid_vwap'], mode='markers', opacity=0.75,
             marker_color='lightblue', name='VWAP of Bids',
             visible='legendonly'),
        dict(y=daily_df['best_ofr'], mode='markers', marker_color='red',
             opacity=0.75, line_shape='hv', name='Best Offer'),
        dict(y=daily_df['ofr_vwap'], mode='markers', opacity=0.75,
             marker_color='pink', name='VWAP of offers',
             visible='legendonly'),
        dict(y=daily_df['price'], mode='markers', marker_color='green',
             marker_symbol='x', name='Trades'),
        dict(y=daily_df['avg_vwap'], mode='lines', line_color='orange',
             name='VWAP of Quotes'),
    ]
    for spec in trace_specs:
        fig.add_trace(go.Scatter(x=daily_df['timestamp'], **spec))

    fig.update_layout(xaxis_tickformat="%H:%M:%S.%f",
                      xaxis_title='Time',
                      yaxis_title='Price',
                      title='Quotes and Trades on IEX for '+date,
                      template='plotly_dark')

    return fig
In [69]:
# Render the 2021-01-22 session as a static PNG
fig = make_plot(plot_df, "2021-01-22")
png_bytes = fig.to_image(format="png")
Image(png_bytes)
Out[69]:
In [70]:
# Render the 2021-01-25 session as a static PNG
fig = make_plot(plot_df, "2021-01-25")
png_bytes = fig.to_image(format="png")
Image(png_bytes)
Out[70]:
In [71]:
# Render the 2021-01-26 session with an annotation marking the price spike
fig = make_plot(plot_df, "2021-01-26")
annotation_kwargs = dict(x=pd.to_datetime('2021-01-26T16:07:00'),
                         y=150,
                         text="Gamestonk!",
                         showarrow=True,
                         arrowhead=4)
fig.add_annotation(**annotation_kwargs)
Image(fig.to_image(format="png"))
Out[71]:
In [72]:
# FIX: assign the returned figure to `fig`. Previously the result of
# make_plot was discarded, so Image() re-rendered the stale figure left
# over from the preceding cell instead of the 2021-01-27 plot.
fig = make_plot(plot_df, "2021-01-27")
Image(fig.to_image(format="png"))
Out[72]:
In [73]:
# FIX: assign the returned figure to `fig`. Previously the result of
# make_plot was discarded, so Image() re-rendered the stale figure left
# over from the preceding cell instead of the 2021-01-28 plot.
fig = make_plot(plot_df, "2021-01-28")
Image(fig.to_image(format="png"))
Out[73]:
In [ ]:
# Reshape to long form: one row per (trade, quote-measure) pair, with the
# measure name in 'Type' and its value in 'Quote'
melt_df = plot_df.melt(id_vars=['timestamp', 'price', 'date'],
                       value_vars=['avg_vwap', 'midpt'],
                       var_name='Type',
                       value_name='Quote')
melt_df
In [ ]:
import plotly.express as px


# Trade price vs each quote measure, one facet per measure, colored by date
fig = px.scatter(melt_df,
                 x='Quote',
                 y='price',
                 opacity = 0.3,
                 color='date',
                 facet_col='Type')

# Rewrite the auto-generated facet labels ("Type=...") into readable titles.
# FIX: the mapping dict is loop-invariant, so build it once outside the loop
# instead of recreating the literal on every iteration.
conversion_dict = {'Type=avg_vwap':'Average VWAP of Quotes',
                   'Type=midpt':'Midpoint of BBO'}
for attribute in fig.layout.annotations:
    attribute.text = conversion_dict[attribute.text]

fig.update_layout(xaxis_title='Quote Price',
                  xaxis2_title='Quote Price',
                  yaxis_title='Trade Price',
                  legend_title='Date',
                  title='Actual Trade Price vs Quote Prices',
                  template='plotly_dark')

# y = x reference line across both facets (a trade exactly at the quoted price)
fig.add_shape(type="line",
              x0=0, y0=0, x1=500, y1=500,
              line=dict(color="white",width=3),
              row='all',col='all');
In [ ]:
# Display the interactive figure, then capture a static PNG of the same figure
fig.show()
img_bytes = fig.to_image(format="png")
Image(img_bytes)